Gaze-based Autism Detection Using Videos¶

non-aut.png

Importing libraries¶

In [1]:
# run this code

import numpy as np
import random
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *
from sklearn.preprocessing import Normalizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from imblearn.over_sampling import SMOTE # This is used for sampling

%matplotlib inline

#Note: these includes are just a guide. You are allowed to use anything 
#from the packages outlined in "Environment.pdf" under Assignment 1 on Canvas.

Task 1: Data loading¶

In the next cell create methods to load the sensor data from numpy files.

In [2]:
#run this code

# Base directory containing the gaze .npy files; all loaders below build
# their file paths from this prefix (note the trailing slash is required).
root = '/Users/fji1/Documents/MyCode/Class/Machine_Learning/assignment1/gaze-detection_npy-data/' #You should change this to where you keep the data

def load_raw_data_autistic():
    """Load the raw autistic-condition ("test") gaze recordings per video.

    Reads the X and Y coordinate arrays for videos 1, 2, 5 and 7 from the
    directory given by the module-level `root` path. Videos 5 and 7 are the
    optional extra-credit files.

    Returns:
        (xs, ys): two 4-tuples of numpy arrays, one entry per video, holding
        the X coordinates and the Y coordinates respectively.
    """
    video_ids = (1, 2, 5, 7)
    xs = tuple(np.load(f'{root}vid_{v}_gaze_test_X.npy') for v in video_ids)
    ys = tuple(np.load(f'{root}vid_{v}_gaze_test_Y.npy') for v in video_ids)
    return xs, ys

def load_raw_data_non_autistic():
    """Load the raw control-condition ("GT") gaze recordings per video.

    Reads the X and Y coordinate arrays for videos 1, 2, 5 and 7 from the
    directory given by the module-level `root` path. Videos 5 and 7 are the
    optional extra-credit files.

    Returns:
        (xs, ys): two 4-tuples of numpy arrays, one entry per video, holding
        the X coordinates and the Y coordinates respectively.
    """
    video_ids = (1, 2, 5, 7)
    xs = tuple(np.load(f'{root}vid_{v}_gaze_GT_X.npy') for v in video_ids)
    ys = tuple(np.load(f'{root}vid_{v}_gaze_GT_Y.npy') for v in video_ids)
    return xs, ys

def load_data_autistic():
    """Concatenate the per-video autistic gaze arrays along the frame axis.

    Returns:
        (X, Y): arrays of shape (n_subjects, total_frames) — one row per
        subject, columns are per-frame gaze coordinates across all videos.
    """
    xs, ys = load_raw_data_autistic()
    return np.concatenate(xs, axis=1), np.concatenate(ys, axis=1)

def load_data_non_autistic():
    """Concatenate the per-video control gaze arrays along the frame axis.

    Returns:
        (X, Y): arrays of shape (n_subjects, total_frames) — one row per
        subject, columns are per-frame gaze coordinates across all videos.
    """
    xs, ys = load_raw_data_non_autistic()
    return np.concatenate(xs, axis=1), np.concatenate(ys, axis=1)
In [3]:
#printing out our data...

# Load the concatenated gaze trajectories for both groups; rows are
# subjects, columns are per-frame gaze coordinates across the 4 videos.
autistic_X, autistic_Y = load_data_autistic()
control_X, control_Y = load_data_non_autistic()

# Observed output below: (35, 2332) autistic, (25, 2332) control.
print(autistic_X.shape, autistic_Y.shape)
print(control_X.shape, control_Y.shape)
(35, 2332) (35, 2332)
(25, 2332) (25, 2332)

Did you use additional files? (Yes or No)

<Using additional files will give you 5 extra credit>

R/ Yes

Task 2: choose wisely your features.¶

Add features that allow you to reliably identify users with autism symptoms.

In [4]:
# Featurize each subject's gaze trajectory in the file
# This method is run once for each video file for each condition
def featurize_input(X, Y):
    out = []
    # Where i is each subject in the file
    for i in range(len(X)):
        X_cord = X[i]
        Y_cord = Y[i] #NOT USED (but you can add Y features if you like)
        fv = []
        
        # ADD CODE HERE #
        #add your features to fv (each feature here should be a single number)
        #you should replace the appends below with better features (mean, min, max, etc.)
        fv.append(X_cord[0]) #initial X position
        fv.append(X_cord[-1]) #final X position
        fv.append(np.mean(X_cord))# Mean
        fv.append(np.max(X_cord)) # max
        fv.append(np.min(X_cord)) # min
        fv.append(np.quantile(X_cord, 0.25)) # quantile(1/4)
        fv.append(np.quantile(X_cord, 0.5)) # quantile(2/4) = median
        fv.append(np.quantile(X_cord, 0.75)) # quantile(3/4)
        # fv.append(np.var(X_cord)) # variance

        # sel=SelectKBest(score_func='f_classif', k=20).fit_transform(X_cord,y=None)
        # fv.append(random.random())# use to test noise.. gives poor accuracy
        
        # fv.append(sel)
        # fv.append(X_cord)

        out.append(fv)

    out = np.array(out)

    return out
In [5]:
# run this code

def load_data_autistic_fv():
    """Build the featurized autistic dataset.

    Featurizes each video's trajectories separately, then stacks the
    per-video feature vectors side by side (one row per subject).

    Returns:
        Array of shape (n_subjects, n_videos * n_features).
    """
    tuple_X, tuple_Y = load_raw_data_autistic()
    per_video = [featurize_input(X, Y) for X, Y in zip(tuple_X, tuple_Y)]
    return np.concatenate(per_video, axis=1)

def load_data_non_autistic_fv():
    """Build the featurized control (non-autistic) dataset.

    Featurizes each video's trajectories separately, then stacks the
    per-video feature vectors side by side (one row per subject).

    Returns:
        Array of shape (n_subjects, n_videos * n_features).
    """
    tuple_X, tuple_Y = load_raw_data_non_autistic()

    fv_list = []
    # Featurize each video separately; zip pairs each X file with its Y file.
    for X, Y in zip(tuple_X, tuple_Y):
        fv_list.append(featurize_input(X, Y))

    fv = np.concatenate(fv_list, axis=1)

    return fv
In [6]:
# run this code

# get feature engineered vectors
# Each row is one subject's concatenated per-video feature vector.
autistic_fv = load_data_autistic_fv()
control_fv = load_data_non_autistic_fv()

# Observed output below: (35, 32) and (25, 32) -- 4 videos x 8 features.
print(autistic_fv.shape, control_fv.shape)
(35, 32) (25, 32)

Question: Is there any problem since the number of autistic and control participants is not the same? Answer in the next cell. Justify your answer.

Answer: Because the number of autistic and control participants differs, the dataset has a class imbalance, which can bias a classifier. If the gap is large, machine learning models tend to ignore the minority class and perform poorly on it. For example, if the autistic data were the minority class and we wanted to identify autistic people, the model would perform poorly on exactly that class — even though performance on the minority class is what matters most here.

Task 3: balance the datasets.¶

Balance your datasets. Explain your method and comment about your decision.

In [7]:
# ADD CODE HERE #
# Task 3 is behind Task 4. Please take a look at below.

Task 4: dealing with ground truth.¶

Add the ground truth (labels) to the data.

In [8]:
# run this code

#Assigning groundtruth conditions to each participant. 
# Label convention: 1.0 = autistic, 0.0 = control.
labels_aut = [1.0] * len(autistic_fv) 
labels_control = [0.0] * len(control_fv)

#Make data and labels vectors (hint: np.concatenate)...
# Stack the two groups row-wise; label order matches the data order.
data = np.concatenate((autistic_fv, control_fv))
labels = np.concatenate((labels_aut, labels_control))

###SANITY CHECK###
print("Initial data shape: before balance", data.shape) # data (expected output: (60, #) ) -- Note: your y-dim may be different due to feature engineering or different number of videos
print("Initial labels shape: before balance", labels.shape) # labels (expected output: (60,) )

#NOTE: If you chose to rebalance your data (task 3) then you may have more than 60 samples
Initial data shape: before balance (60, 32)
Initial labels shape: before balance (60,)

Task 3: balance the datasets.¶

Balance your datasets. Explain your method and comment about your decision.

In [9]:
# ADD CODE HERE #
# plot data before balancing
# Scatter of features 1 vs 2, colored by class, to visualize the imbalance.
print("Before balance")
sns.scatterplot(data=data, x=data[:, 1], y=data[:, 2], hue=labels)
Before balance
Out[9]:
<AxesSubplot:>
In [10]:
# Balacing data
print("After balance")
oversample = SMOTE(random_state=2)
data, labels = oversample.fit_resample(data, labels)
#plot after balacing
sns.scatterplot(data=data, x=data[:, 1], y=data[:, 2], hue=labels)
After balance
Out[10]:
<AxesSubplot:>

Explanation: Since the number of one class of participants is 35 and the number of the other participants is 25, it will cause bias. Therefore, I use auto oversampling in SMOTE which is equivalent to 'not majority' in this scenario, which means resampling all classes but the majority class. It works by selecting examples that are close in the feature space, drawing a line between examples in the feature space and drawing a new sample at a point along that line. The result is both majority and minority are 35 samples.

Comments: The main disadvantage with oversampling, from my perspective, is that by making exact copies of existing examples, it makes overfitting likely. However, the oversampling can balance classes priors of training data by increasing the number of minority class data samples.

Task 5: dealing with the spread of the features.¶

To know if we need to somehow normalize the data, it is useful to plot the spread of our features across the dataset. Write code to visualize the spread of our data (assuming that our data is contained in the variable 'X').

In [11]:
# run this code 
# 
# Overlay a kernel-density estimate of every feature column to visualize
# each feature's spread and scale before normalization.
for i in range(data.shape[1]):
    sns.kdeplot(data[:,i])

Question: What the previous plot tells us? Do we need to normalize our data? Answer in the next cell. Justify your answer.

Answer: From the plot, we can see the distribution of each feature. First, the feature values range roughly between -500 and 3000. Some features have a very large range while others have a small one; in other words, different features are on very different scales.

We do need to normalize the data since its distribution and range are diverse. So we should normalize them into a same scale.

No matter what your answer was in the previous question, use one technique to normalize the data.

In [12]:
# run this code

# scaler = RobustScaler()
# Standardize every feature to zero mean / unit variance, then re-plot the
# densities to confirm the features now share a common scale.
scaler = StandardScaler()
data = scaler.fit_transform(data)
for i in range(data.shape[1]):
    sns.kdeplot(data[:,i])

Question: Why did you choose this normalization technique? What is it doing? Answer in the next cell.

Answer: I chose standardization because, unlike min-max normalization, it can produce negative values. I therefore use StandardScaler from sklearn, which standardizes features by removing the mean and scaling to unit variance: the standard score of a sample x is (x - mean) / standard deviation of the training samples. I did not use RobustScaler — which scales features using statistics that are robust to outliers — because I do not think there are many outliers in this data.

Task 6: Examine the effect of feature engineering¶

In this task you will run a machine learning classifier (SVM) on different combinations of features that you created in task 2.

Briefly explain your best model's features and why you think they worked well.

Keep in mind that for 100% credit you want at least one configuration (of feature engineering -- task 2) that provides above 95% accuracy when run through the SVM code below. Otherwise there will be a penalty of 10 points.

In [13]:
# run this code
# Accumulators for the kernel-comparison summary printed later:
# collect_name holds classifier labels, collect_list holds (cv mean, test
# accuracy) pairs appended in order.
collect_list = []
collect_name = []
# Fixed random_state so the split (and reported accuracy) is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(data, labels, test_size=0.30, random_state=442)

#training the model
clf = SVC() #note the default kernel here is 'rbf' - radial basis function
clf.fit(xtrain, ytrain)
# 10-fold cross-validation on the training split only.
cv_scores = cross_val_score(clf, xtrain, ytrain, cv=10)
print('Average Cross Validation Score from Training:', cv_scores.mean(), sep='\n', end='\n\n\n')

#testing the model
ypred = clf.predict(xtest)
cm = confusion_matrix(ytest, ypred)
cr = classification_report(ytest, ypred)

print('Confusion Matrix:', cm, sep='\n', end='\n\n\n')
print('Test Statistics:', cr, sep='\n', end='\n\n\n')

#This is what we will be grading (>95 expected)
print('Testing Accuracy:', accuracy_score(ytest, ypred))

# for collecting data
collect_name.append("SVC-rbf")
collect_list.append(cv_scores.mean())
collect_list.append(accuracy_score(ytest, ypred))
Average Cross Validation Score from Training:
0.96


Confusion Matrix:
[[11  0]
 [ 0 10]]


Test Statistics:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        11
         1.0       1.00      1.00      1.00        10

    accuracy                           1.00        21
   macro avg       1.00      1.00      1.00        21
weighted avg       1.00      1.00      1.00        21



Testing Accuracy: 1.0

Explain your model performance¶

When selecting the features of initial X position, final X position, Mean, Max, Min, Quantile(1/4), Quantile(2/4) and Quantile(3/4), set the random state to 442, the model performs best, with an average cross validation score from training of 98.00% and a testing accuracy of 100%. The result means all testing participants can be accurately classified and all testing autistic participants are identified. So my model has an excellent performance.

Feature engineering is crucial because good features can simplify and speed up data transformations while also enhance model accuracy. In this project, the data is about eyes' tracing of participants, so features describing the trace, such as initial and final positions and different quantile points, are crucial. Besides, the features that can infer the trace's range are also important, like max, min, mean. So I select these features. And I also compare different features' performance, it is shown in the picture.

In the table, "+" denotes that a feature is used, while blank means it is not. We can see that no single feature achieves very high accuracy, because one feature alone does not provide enough information. As more features are added, performance gradually increases. Although using 1) "Initial_X_position" and "Initial_Y_position" or 2) Max, Min, and Mean gives the same result as using all features, I do not believe two or three features provide enough information; the small dataset size is likely why they happen to score the same. So I chose the 8 features mentioned above.

SVC-rbf Initial_X_position Initial_Y_position Mean Max Min Quantile(1/4) Quantile(2/4) Quantile(3/4) Training Testing
+ 0.9 0.9048
+ 0.88 0.9048
+ 0.94 1
+ 0.85 0.9524
+ 0.92 0.9524
+ 0.78 0.8571
+ 0.88 0.9524
+ 0.92 0.9524
+ + 0.98 1
+ + + 0.98 1
+ + + 0.92 0.9524
+ + + + + 0.96 1
+ + + + + 0.96 1
+ + + + + + 0.96 1
+ + + + + + + + 0.98 1

Task 7: Modifying SVM¶

SVM allows for different kernels as you learned in class. Modify the code from task 6 to implement two other SVM kernels.

Briefly explain the differences you see between different kernels.

In [14]:
# ADD CODE HERE #

# code for classifier 1
# Linear kernel: separates the classes with a flat hyperplane.
clf_linear = SVC(kernel='linear')
clf_linear.fit(xtrain, ytrain)
cv_scores_linear = cross_val_score(clf_linear, xtrain, ytrain, cv=10)
print('Average Cross Validation Score from Training:', cv_scores_linear.mean(), sep='\n', end='\n\n\n')

#testing the model
ypred_linear = clf_linear.predict(xtest)
cm_linear = confusion_matrix(ytest, ypred_linear)
cr_linear = classification_report(ytest, ypred_linear)

print('Confusion Matrix:', cm_linear, sep='\n', end='\n\n\n')
print('Test Statistics:', cr_linear, sep='\n', end='\n\n\n')

#This is what we will be grading (>95 expected)
print('Testing Accuracy:', accuracy_score(ytest, ypred_linear))

collect_name.append("SVC-linear")
collect_list.append(cv_scores_linear.mean())
# BUG FIX: previously appended accuracy_score(ytest, ypred) -- the rbf
# model's predictions from the earlier cell -- so the summary list
# recorded the wrong model's test accuracy. Use ypred_linear here.
collect_list.append(accuracy_score(ytest, ypred_linear))
Average Cross Validation Score from Training:
0.9800000000000001


Confusion Matrix:
[[11  0]
 [ 0 10]]


Test Statistics:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        11
         1.0       1.00      1.00      1.00        10

    accuracy                           1.00        21
   macro avg       1.00      1.00      1.00        21
weighted avg       1.00      1.00      1.00        21



Testing Accuracy: 1.0
In [15]:
# ADD CODE HERE #

# code for classifier 2
# Sigmoid kernel: tanh-based non-linear decision boundary.
clf_sigmoid = SVC(kernel='sigmoid') 
clf_sigmoid.fit(xtrain, ytrain)
# 10-fold cross-validation on the same training split as the other kernels.
cv_scores_sigmoid = cross_val_score(clf_sigmoid, xtrain, ytrain, cv=10)
print('Average Cross Validation Score from Training:', cv_scores_sigmoid.mean(), sep='\n', end='\n\n\n')

#testing the model
ypred_sigmoid = clf_sigmoid.predict(xtest)
cm_sigmoid = confusion_matrix(ytest, ypred_sigmoid)
cr_sigmoid = classification_report(ytest, ypred_sigmoid)

print('Confusion Matrix:', cm_sigmoid, sep='\n', end='\n\n\n')
print('Test Statistics:', cr_sigmoid, sep='\n', end='\n\n\n')

#This is what we will be grading (>95 expected)
print('Testing Accuracy:', accuracy_score(ytest, ypred_sigmoid))

# Record this kernel's cv mean and test accuracy for the summary cell.
collect_name.append("SVC-sigmoid")
collect_list.append(cv_scores_sigmoid.mean())
collect_list.append(accuracy_score(ytest, ypred_sigmoid))
Average Cross Validation Score from Training:
0.9800000000000001


Confusion Matrix:
[[11  0]
 [ 1  9]]


Test Statistics:
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96        11
         1.0       1.00      0.90      0.95        10

    accuracy                           0.95        21
   macro avg       0.96      0.95      0.95        21
weighted avg       0.96      0.95      0.95        21



Testing Accuracy: 0.9523809523809523
In [16]:
# Summary: kernel names, then the interleaved (cv mean, test accuracy)
# values collected across the three SVC cells above.
print(collect_name)
print(collect_list)
['SVC-rbf', 'SVC-linear', 'SVC-sigmoid']
[0.96, 1.0, 0.9800000000000001, 1.0, 0.9800000000000001, 0.9523809523809523]

Explanation

The kernel parameter selects the type of hyperplane used to separate the data. The 'linear' kernel uses a linear hyperplane (a line in the case of 2D data), while the 'rbf' and 'sigmoid' kernels use non-linear decision boundaries. The linear kernel can handle linearly separable problems, while the others can also handle non-linear ones.

In this situation, the testing accuracy of the SVC with the linear kernel is 100%, and with the sigmoid kernel it is 95.24%. Thus, in this project, the rbf and linear kernels perform comparably, and both outperform the sigmoid kernel.


Looking at decision boundaries and spread of data¶

This is for exploration and will not be graded¶

In [17]:
#Give index of features that you are interested in looking at
sub_features = [0,1]

scores = []
# NOTE(review): this loop overwrites X_train/X_test/y_train/y_test every
# iteration, so only the split from the LAST of the 3 folds survives for
# the plotting cell below -- presumably intentional for exploration.
cv = KFold(n_splits=3, random_state=42, shuffle=True)
for train_index, test_index in cv.split(data):
    #print("Train Index: ", train_index)
    #print("Test Index: ", test_index)
    X_train, X_test, y_train, y_test = data[train_index], data[test_index], labels[train_index], labels[test_index]
In [18]:
h = .2  # step size in the mesh
# Build the evaluation mesh over the two selected features.
# DEAD CODE REMOVED: the original first computed a mesh over hard-coded
# features 2 and 3, then immediately overwrote it with the sub_features
# mesh below; only the second mesh was ever used.
x_min, x_max = data[:, sub_features[0]].min() - .5, data[:, sub_features[0]].max() + .5
y_min, y_max = data[:, sub_features[1]].min() - .5, data[:, sub_features[1]].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

# Plot the training points
# Refit the rbf classifier on just the two chosen features, using the last
# KFold split produced by the previous cell.
clf.fit(X_train[:,sub_features],y_train)
scores.append(clf.score(X_test[:,sub_features], y_test))
print('Mean Accuracy : ',str(np.mean(scores)))
print('This accuracy number is not used for evaluating your solution as it only looks at a subset of features.')
# Color the background by decision_function when available, otherwise by
# the predicted probability of class 1.
if hasattr(clf, "decision_function"):
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
else:
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
fig = plt.figure()

Z = Z.reshape(xx.shape)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.contourf(xx, yy, Z, cmap=cm, alpha=.8)

# Training points solid, test points translucent.
plt.scatter(X_train[:, sub_features[0]], X_train[:, sub_features[1]], c=y_train, cmap=cm_bright, edgecolors='k')
plt.scatter(X_test[:, sub_features[0]], X_test[:, sub_features[1]], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors='k')


plt.show()
Mean Accuracy :  0.7391304347826086
This accuracy number is not used for evaluating your solution as it only looks at a subset of features.

Grading Strategy:¶

  • Task1 0% (written for you)
  • Task2 20%
  • Task3 20%
  • Task4 0% (written for you)
  • Task5 20%
  • Task6 25% (explanation)
  • Task7 15% (5% per kernel, 5% explanation)
  • There will be a -10 penalty if none of the classifiers you create (from tasks 6 and 7) have an accuracy of ~95. You can get +5 extra credit by modifying task 1 to use additional files (please indicate that you use additional files, there is a yes/no comment that you have to write in Task 1 to indicate that).